ISL Exercise 8.4.3 (10pts)
library(tidyverse)

# Class probabilities over a fine grid: p1 = P(class 1), p2 = 1 - p1.
p1 <- seq(0, 1, 0.01)
p2 <- 1 - p1
# Classification error: 1 - max(p1, p2), computed elementwise.
classification_error <- 1 - pmax(p1, p2)
# Gini index: sum over classes of p * (1 - p).
gini <- p1 * (1 - p1) + p2 * (1 - p2)
# Entropy: -sum p * log(p). Use the standard convention 0 * log(0) = 0 so
# the endpoints p1 = 0 and p1 = 1 are well defined; the naive formula
# yields NaN there, which caused geom_line()'s "Removed 2 rows" warning.
plogp <- function(p) ifelse(p > 0, p * log(p), 0)
entropy <- -plogp(p1) - plogp(p2)

data.frame(p1, p2, classification_error, gini, entropy) %>%
  pivot_longer(cols = c(classification_error, gini, entropy),
               names_to = "metrics") %>%
  ggplot(aes(x = p1, y = value, col = factor(metrics))) +
  geom_line() +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  # Labels follow the alphabetical factor-level order of `metrics`.
  scale_color_hue(labels = c("Classification Error", "Entropy", "Gini")) +
  labs(col = "Metrics",
       y = "Value",
       x = "P1")
Warning: Removed 2 rows containing missing values (`geom_line()`).
ISL Exercise 8.4.4 (10pts)
Here is my answer to this question:
ISL Exercise 8.4.5 (10pts)
Method one: Majority vote approach
# Estimated P(Red) from each of the 10 bootstrapped trees.
probs <- c(0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, 0.75)
# Majority vote: a tree votes Red when its estimated P(Red) >= 0.5.
red_votes <- probs >= 0.5
cat("The number of red observations is: ", sum(red_votes), " \n ")
The number of red observations is: 6
# Trees voting Green have P(Red) < 0.5; `probs` comes from the chunk above.
green_votes <- probs < 0.5
cat("The number of green observations is: ", sum(green_votes))
The number of green observations is: 4
The final classification under the majority vote approach is red.
Method two: Average probability approach
The average probability is 0.45, which is less than 0.5. The final classification under the average probability approach is green.
ISL Lab 8.3. Boston data set (30pts)
Follow the machine learning workflow to train regression tree, random forest, and boosting methods for predicting medv. Evaluate out-of-sample performance on a test set.
Answer:
Regression tree
Load and summary data
# GGally supplies ggpairs(); tidymodels drives the modeling workflow;
# ISLR2 provides the Boston housing data set.
library (GGally)
library (tidyverse)
library (tidymodels)
library (ISLR2)
# Load the Boston data set
data (Boston)
# Summary of the data
summary (Boston)
crim zn indus chas
Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
nox rm age dis
Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
rad tax ptratio lstat
Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
Median : 5.000 Median :330.0 Median :19.05 Median :11.36
Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
medv
Min. : 5.00
1st Qu.:17.02
Median :21.20
Mean :22.53
3rd Qu.:25.00
Max. :50.00
# Pairwise scatterplot matrix of all Boston variables; small, translucent
# points and sparse axis breaks keep the 13x13 grid readable.
ggpairs (Boston, lower= list (continuous= wrap ("points" , alpha= 0.3 , size= 0.3 )),
diag= list (continuous= 'barDiag' )) +
theme (axis.text.x = element_text (angle = 45 , vjust = 1 , hjust= 1 )) +
scale_x_continuous (breaks = scales:: pretty_breaks (n = 3 )) +
scale_y_continuous (breaks = scales:: pretty_breaks (n = 3 ))
Split the data into training and test sets
# Reserve 25% of the observations as a held-out test set; the seed makes
# the split reproducible.
set.seed(123)
data_split <- initial_split(Boston, prop = 0.75)
Boston_train <- training(data_split)
dim(Boston_train)
Boston_test <- testing(data_split)
dim(Boston_test)
Build recipe
# Preprocessing recipe: dummy-encode nominal *predictors* and standardize
# numeric predictors. all_nominal_predictors() (rather than all_nominal())
# guarantees the outcome can never be dummy-encoded by accident, and
# matches the Carseats recipe later in this document.
rec <- recipe(medv ~ ., data = Boston_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())
Train regression tree
Regression tree model
# Regression tree: tune the pruning penalty and maximum depth by CV;
# keep the minimum node size fixed at 10.
regtree_mod <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = 10
) %>%
  set_mode("regression") %>%
  set_engine("rpart")
Set workflow
# Bundle the preprocessing recipe and the model spec in one workflow.
regtree_wf <- workflow(rec, regtree_mod)
regtree_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Decision Tree Model Specification (regression)
Main Arguments:
cost_complexity = tune()
tree_depth = tune()
min_n = 10
Computational engine: rpart
Tuning grid
# 100 log-spaced cost-complexity values (1e-10 .. 1e-2) crossed with
# 5 tree depths (1 .. 10): 500 candidate models.
regtree_grid <- grid_regular(
  cost_complexity(range = c(-10, -2)),
  tree_depth(range = c(1, 10)),
  levels = c(100, 5)
)
10-fold Cross-validation
# 10-fold CV resamples of the training set; seed fixed for reproducibility.
set.seed (123 )
folds <- vfold_cv (Boston_train, v = 10 )
folds
# 10-fold cross-validation
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [341/38]> Fold01
2 <split [341/38]> Fold02
3 <split [341/38]> Fold03
4 <split [341/38]> Fold04
5 <split [341/38]> Fold05
6 <split [341/38]> Fold06
7 <split [341/38]> Fold07
8 <split [341/38]> Fold08
9 <split [341/38]> Fold09
10 <split [342/37]> Fold10
Fit cross-validation
# Evaluate every grid candidate on every fold, recording RMSE and R^2.
regtree_fit <- regtree_wf %>%
tune_grid (
resamples = folds,
grid = regtree_grid,
metrics = metric_set (rmse, rsq)
)
regtree_fit
# Tuning results
# 10-fold cross-validation
# A tibble: 10 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [341/38]> Fold01 <tibble [1,000 × 6]> <tibble [0 × 3]>
2 <split [341/38]> Fold02 <tibble [1,000 × 6]> <tibble [0 × 3]>
3 <split [341/38]> Fold03 <tibble [1,000 × 6]> <tibble [0 × 3]>
4 <split [341/38]> Fold04 <tibble [1,000 × 6]> <tibble [0 × 3]>
5 <split [341/38]> Fold05 <tibble [1,000 × 6]> <tibble [0 × 3]>
6 <split [341/38]> Fold06 <tibble [1,000 × 6]> <tibble [0 × 3]>
7 <split [341/38]> Fold07 <tibble [1,000 × 6]> <tibble [0 × 3]>
8 <split [341/38]> Fold08 <tibble [1,000 × 6]> <tibble [0 × 3]>
9 <split [341/38]> Fold09 <tibble [1,000 × 6]> <tibble [0 × 3]>
10 <split [342/37]> Fold10 <tibble [1,000 × 6]> <tibble [0 × 3]>
Visualize CV results
# Plot CV RMSE against cost complexity, one curve per tree depth.
# print() returns its input invisibly, so it can sit inside the pipe.
regtree_fit %>%
  collect_metrics() %>%
  print(width = Inf) %>%
  filter(.metric == "rmse") %>%
  mutate(tree_depth = as.factor(tree_depth)) %>%
  ggplot(mapping = aes(x = cost_complexity, y = mean, color = tree_depth)) +
  geom_point() +
  geom_line() +
  # The grid is log-spaced over [1e-10, 1e-2]; a log axis stops the points
  # collapsing onto the left edge of a linear axis.
  scale_x_log10() +
  labs(x = "Cost Complexity", y = "CV RMSE")
# A tibble: 1,000 × 8
cost_complexity tree_depth .metric .estimator mean n std_err
<dbl> <int> <chr> <chr> <dbl> <int> <dbl>
1 1 e-10 1 rmse standard 7.54 10 0.265
2 1 e-10 1 rsq standard 0.330 10 0.0352
3 1.20e-10 1 rmse standard 7.54 10 0.265
4 1.20e-10 1 rsq standard 0.330 10 0.0352
5 1.45e-10 1 rmse standard 7.54 10 0.265
6 1.45e-10 1 rsq standard 0.330 10 0.0352
7 1.75e-10 1 rmse standard 7.54 10 0.265
8 1.75e-10 1 rsq standard 0.330 10 0.0352
9 2.10e-10 1 rmse standard 7.54 10 0.265
10 2.10e-10 1 rsq standard 0.330 10 0.0352
.config
<chr>
1 Preprocessor1_Model001
2 Preprocessor1_Model001
3 Preprocessor1_Model002
4 Preprocessor1_Model002
5 Preprocessor1_Model003
6 Preprocessor1_Model003
7 Preprocessor1_Model004
8 Preprocessor1_Model004
9 Preprocessor1_Model005
10 Preprocessor1_Model005
# ℹ 990 more rows
Finalize regression tree model
# Top 5 candidates by CV RMSE. The metric is passed by name: recent
# versions of {tune} deprecate supplying it positionally.
regtree_fit %>%
  show_best(metric = "rmse")
# A tibble: 5 × 8
cost_complexity tree_depth .metric .estimator mean n std_err .config
<dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00129 10 rmse standard 4.30 10 0.579 Preprocesso…
2 0.00129 7 rmse standard 4.31 10 0.591 Preprocesso…
3 0.000242 7 rmse standard 4.33 10 0.582 Preprocesso…
4 0.000138 7 rmse standard 4.33 10 0.582 Preprocesso…
5 0.000167 7 rmse standard 4.33 10 0.582 Preprocesso…
# Single best hyperparameter combination by CV RMSE (named `metric`
# argument — positional use is deprecated in recent {tune}).
best_regtree <- regtree_fit %>%
  select_best(metric = "rmse")
best_regtree
# A tibble: 1 × 3
cost_complexity tree_depth .config
<dbl> <int> <chr>
1 0.00129 10 Preprocessor1_Model489
Final workflow
# Plug the selected hyperparameters back into the workflow.
final_regtree_wf <- regtree_wf %>%
finalize_workflow (best_regtree)
final_regtree_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: decision_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Decision Tree Model Specification (regression)
Main Arguments:
cost_complexity = 0.00129154966501489
tree_depth = 10
min_n = 10
Computational engine: rpart
Fit the whole training set, then predict the test cases
# last_fit(): refit on the full training set, then score the test set.
final_regtree_fit <-
final_regtree_wf %>%
last_fit (data_split)
# Test metrics
final_regtree_fit %>%
collect_metrics ()
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 4.43 Preprocessor1_Model1
2 rsq standard 0.783 Preprocessor1_Model1
Visualize the final model
# Extract the fitted workflow, draw the pruned tree, and plot variable
# importance from the underlying rpart fit.
library (rpart.plot)
final_regtree <- extract_workflow (final_regtree_fit)
final_regtree %>%
extract_fit_engine () %>%
rpart.plot (roundint = FALSE )
library (vip)
final_regtree %>%
extract_fit_parsnip () %>%
vip ()
Summary: The final regression tree model has a depth of 10 and a cost complexity of 0.0013. The 3 most important variables are rm, lstat and dis, meaning that the average number of rooms per dwelling, the lower-status share of the population, and the weighted distance to five Boston employment centers are the most important predictors of medv. The model estimates new medv values using the average medv in the terminal nodes (leaves) and has a relatively good performance, with a test RMSE of 4.43 and R-squared of 0.78.
Train random forest
Random forest model
# Random forest for regression: tune the number of candidate predictors
# per split (mtry) and the ensemble size (trees).
rf_mod <- rand_forest(
  mtry = tune(),
  trees = tune()
) %>%
  set_mode("regression") %>%
  set_engine("ranger")
rf_mod
Random Forest Model Specification (regression)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
Set workflow
# Same preprocessing recipe, random-forest model spec.
rf_wf <- workflow(rec, rf_mod)
rf_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
Tuning grid
# 10 ensemble sizes (100-500 trees) crossed with mtry in 2..6: 50 candidates.
rf_grid <- grid_regular (
trees (range = c (100 L, 500 L)),
mtry (range = c (2 L, 6 L)),
levels = c (10 , 5 )
)
10-fold Cross-validation
# Same seed as before, so all models are tuned on identical folds.
set.seed (123 )
folds <- vfold_cv (Boston_train, v = 10 )
Fit cross-validation
# Cross-validate every (trees, mtry) pair; ranger is the fitting engine.
library (ranger)
rf_fit <- rf_wf %>%
tune_grid (
resamples = folds,
grid = rf_grid,
metrics = metric_set (rmse, rsq)
)
rf_fit
# Tuning results
# 10-fold cross-validation
# A tibble: 10 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [341/38]> Fold01 <tibble [100 × 6]> <tibble [0 × 3]>
2 <split [341/38]> Fold02 <tibble [100 × 6]> <tibble [0 × 3]>
3 <split [341/38]> Fold03 <tibble [100 × 6]> <tibble [0 × 3]>
4 <split [341/38]> Fold04 <tibble [100 × 6]> <tibble [0 × 3]>
5 <split [341/38]> Fold05 <tibble [100 × 6]> <tibble [0 × 3]>
6 <split [341/38]> Fold06 <tibble [100 × 6]> <tibble [0 × 3]>
7 <split [341/38]> Fold07 <tibble [100 × 6]> <tibble [0 × 3]>
8 <split [341/38]> Fold08 <tibble [100 × 6]> <tibble [0 × 3]>
9 <split [341/38]> Fold09 <tibble [100 × 6]> <tibble [0 × 3]>
10 <split [342/37]> Fold10 <tibble [100 × 6]> <tibble [0 × 3]>
Visualize CV results
# CV RMSE vs number of trees, one curve per mtry value.
rf_fit %>%
collect_metrics () %>%
print (width = Inf ) %>%
filter (.metric == "rmse" ) %>%
mutate (mtry = as.factor (mtry)) %>%
ggplot (mapping = aes (x = trees, y = mean, color = mtry)) +
geom_point () +
geom_line () +
labs (x = "Number of Trees" , y = "CV RMSE" )
# A tibble: 100 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 2 100 rmse standard 3.55 10 0.399 Preprocessor1_Model01
2 2 100 rsq standard 0.865 10 0.0236 Preprocessor1_Model01
3 2 144 rmse standard 3.51 10 0.425 Preprocessor1_Model02
4 2 144 rsq standard 0.867 10 0.0256 Preprocessor1_Model02
5 2 188 rmse standard 3.60 10 0.419 Preprocessor1_Model03
6 2 188 rsq standard 0.858 10 0.0250 Preprocessor1_Model03
7 2 233 rmse standard 3.61 10 0.435 Preprocessor1_Model04
8 2 233 rsq standard 0.857 10 0.0267 Preprocessor1_Model04
9 2 277 rmse standard 3.62 10 0.429 Preprocessor1_Model05
10 2 277 rsq standard 0.857 10 0.0267 Preprocessor1_Model05
# ℹ 90 more rows
Finalize random forest model
# Top 5 candidates by CV RMSE; metric passed by name (positional use is
# deprecated in recent {tune}).
rf_fit %>%
  show_best(metric = "rmse")
# A tibble: 5 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 5 411 rmse standard 3.14 10 0.308 Preprocessor1_Model38
2 5 233 rmse standard 3.14 10 0.307 Preprocessor1_Model34
3 5 188 rmse standard 3.15 10 0.283 Preprocessor1_Model33
4 6 500 rmse standard 3.15 10 0.309 Preprocessor1_Model50
5 6 233 rmse standard 3.15 10 0.294 Preprocessor1_Model44
# Best (mtry, trees) pair by CV RMSE, with the metric named explicitly.
best_rf <- rf_fit %>%
  select_best(metric = "rmse")
best_rf
# A tibble: 1 × 3
mtry trees .config
<int> <int> <chr>
1 5 411 Preprocessor1_Model38
Final workflow
# Lock the selected hyperparameters into the workflow.
final_rf_wf <- rf_wf %>%
finalize_workflow (best_rf)
final_rf_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = 5
trees = 411
Computational engine: ranger
Fit the whole training set, then predict the test cases
# Refit on the full training set and evaluate once on the test set.
final_rf_fit <-
final_rf_wf %>%
last_fit (data_split)
final_rf_fit
# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [379/127]> train/test split <tibble> <tibble> <tibble> <workflow>
# Test metrics
# Test-set RMSE and R^2 for the finalized random forest.
final_rf_fit %>%
collect_metrics ()
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 3.61 Preprocessor1_Model1
2 rsq standard 0.844 Preprocessor1_Model1
Summary: The random forest algorithm grows multiple decision trees; the final random forest model has 411 trees and 5 predictors randomly sampled at each split. For regression, the random forest estimates new medv values as the average of the predictions across all trees. The model has a relatively good performance, with a test RMSE of 3.61 and R-squared of 0.84.
Train boosting
Train boosting model
# Gradient boosting for regression: 1000 trees, with tree depth and
# learning rate tuned by CV.
gb_mod <- boost_tree(
  trees = 1000,
  tree_depth = tune(),
  learn_rate = tune()
) %>%
  set_mode("regression") %>%
  set_engine("xgboost")
gb_mod
Boosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
Set workflow
# Same recipe, boosted-tree model spec.
gb_wf <- workflow () %>%
add_recipe (rec) %>%
add_model (gb_mod)
gb_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
Tuning grid
# 5 depths (2..10) crossed with 10 log-spaced learning rates (1e-3..1e-1).
gb_grid <- grid_regular (
tree_depth (range = c (2 L, 10 L)),
learn_rate (range = c (- 3 , - 1 ), trans = log10_trans ()),
levels = c (5 , 10 )
)
gb_grid
# A tibble: 50 × 2
tree_depth learn_rate
<int> <dbl>
1 2 0.001
2 4 0.001
3 6 0.001
4 8 0.001
5 10 0.001
6 2 0.00167
7 4 0.00167
8 6 0.00167
9 8 0.00167
10 10 0.00167
# ℹ 40 more rows
10-fold Cross-validation
# Same seed again: identical folds across all three models.
set.seed (123 )
folds <- vfold_cv (Boston_train, v = 10 )
Fit cross-validation
# Cross-validate every (depth, learning rate) pair with xgboost.
library (xgboost)
gb_fit <- gb_wf %>%
tune_grid (
resamples = folds,
grid = gb_grid,
metrics = metric_set (rmse, rsq)
)
gb_fit
# Tuning results
# 10-fold cross-validation
# A tibble: 10 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [341/38]> Fold01 <tibble [100 × 6]> <tibble [0 × 3]>
2 <split [341/38]> Fold02 <tibble [100 × 6]> <tibble [0 × 3]>
3 <split [341/38]> Fold03 <tibble [100 × 6]> <tibble [0 × 3]>
4 <split [341/38]> Fold04 <tibble [100 × 6]> <tibble [0 × 3]>
5 <split [341/38]> Fold05 <tibble [100 × 6]> <tibble [0 × 3]>
6 <split [341/38]> Fold06 <tibble [100 × 6]> <tibble [0 × 3]>
7 <split [341/38]> Fold07 <tibble [100 × 6]> <tibble [0 × 3]>
8 <split [341/38]> Fold08 <tibble [100 × 6]> <tibble [0 × 3]>
9 <split [341/38]> Fold09 <tibble [100 × 6]> <tibble [0 × 3]>
10 <split [342/37]> Fold10 <tibble [100 × 6]> <tibble [0 × 3]>
Visualize CV results
# CV RMSE vs learning rate (log axis, matching the log-spaced grid),
# one curve per tree depth.
gb_fit %>%
collect_metrics () %>%
print (width = Inf ) %>%
filter (.metric == "rmse" ) %>%
ggplot (mapping = aes (x = learn_rate, y = mean, color = factor (tree_depth))) +
geom_point () +
geom_line () +
labs (x = "Learning Rate" , y = "CV RMSE" ) +
scale_x_log10 ()
# A tibble: 100 × 8
tree_depth learn_rate .metric .estimator mean n std_err
<int> <dbl> <chr> <chr> <dbl> <int> <dbl>
1 2 0.001 rmse standard 9.94 10 0.483
2 2 0.001 rsq standard 0.779 10 0.0310
3 4 0.001 rmse standard 9.85 10 0.446
4 4 0.001 rsq standard 0.817 10 0.0287
5 6 0.001 rmse standard 9.86 10 0.439
6 6 0.001 rsq standard 0.822 10 0.0290
7 8 0.001 rmse standard 9.87 10 0.442
8 8 0.001 rsq standard 0.821 10 0.0297
9 10 0.001 rmse standard 9.87 10 0.442
10 10 0.001 rsq standard 0.821 10 0.0297
.config
<chr>
1 Preprocessor1_Model01
2 Preprocessor1_Model01
3 Preprocessor1_Model02
4 Preprocessor1_Model02
5 Preprocessor1_Model03
6 Preprocessor1_Model03
7 Preprocessor1_Model04
8 Preprocessor1_Model04
9 Preprocessor1_Model05
10 Preprocessor1_Model05
# ℹ 90 more rows
Finalize boosting model
# Top 5 candidates by CV RMSE; metric named explicitly (positional use is
# deprecated in recent {tune}).
gb_fit %>%
  show_best(metric = "rmse")
# A tibble: 5 × 8
tree_depth learn_rate .metric .estimator mean n std_err .config
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 6 0.0129 rmse standard 3.12 10 0.333 Preprocessor1_Mo…
2 6 0.0215 rmse standard 3.12 10 0.339 Preprocessor1_Mo…
3 6 0.0359 rmse standard 3.13 10 0.337 Preprocessor1_Mo…
4 4 0.0215 rmse standard 3.13 10 0.305 Preprocessor1_Mo…
5 6 0.00774 rmse standard 3.13 10 0.339 Preprocessor1_Mo…
# Best (depth, learning rate) pair by CV RMSE, metric named explicitly.
best_gb <- gb_fit %>%
  select_best(metric = "rmse")
best_gb
# A tibble: 1 × 3
tree_depth learn_rate .config
<int> <dbl> <chr>
1 6 0.0129 Preprocessor1_Model28
Final workflow
# Lock the selected hyperparameters into the workflow.
final_gb_wf <- gb_wf %>%
finalize_workflow (best_gb)
final_gb_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = 6
learn_rate = 0.0129154966501488
Computational engine: xgboost
Fit the whole training set, then predict the test cases
# Refit on the full training set, then score the held-out test set.
final_gb_fit <-
final_gb_wf %>%
last_fit (data_split)
# Test metrics
final_gb_fit %>%
collect_metrics ()
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 3.71 Preprocessor1_Model1
2 rsq standard 0.840 Preprocessor1_Model1
Summary: Gradient Boosting builds trees in a sequential manner, where each new tree aims to correct the errors made by the previous ones. The final gradient boosting model has 1000 trees, 6 levels of tree depth, and 0.01291 learning rate. The model estimates new medv values by adding trees that predict the residuals or errors of prior trees and it has a relatively good performance with RMSE of 3.71 and R-squared of 0.84.
Compare Performance
# Collect the test-set metrics from all three final fits and reshape to
# one row per model. pivot_wider() replaces the superseded spread();
# note pivot_wider() preserves input row order rather than sorting by key.
final_regtree_fit %>%
  collect_metrics() %>%
  mutate(model = "Regression Tree") %>%
  bind_rows(
    final_rf_fit %>% collect_metrics() %>%
      mutate(model = "Random Forest"),
    final_gb_fit %>% collect_metrics() %>%
      mutate(model = "Gradient Boosting")
  ) %>%
  select(model, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate)
# A tibble: 3 × 3
model rmse rsq
<chr> <dbl> <dbl>
1 Gradient Boosting 3.71 0.840
2 Random Forest 3.61 0.844
3 Regression Tree 4.43 0.783
ISL Lab 8.3 Carseats data set (30pts)
Follow the machine learning workflow to train classification tree, random forest, and boosting methods for classifying Sales <= 8 versus Sales > 8. Evaluate out-of-sample performance on a test set.
Answer: Load the data and create Sales binary response variable
# gtsummary supplies tbl_summary() for the stratified descriptive table.
library (ISLR2)
library (tidyverse)
library (tidymodels)
library (gtsummary)
# Load the Carseats data set
data (Carseats)
# Binarize the outcome (Sales > 8 => "Yes") and drop the raw Sales column
# so it cannot leak into the predictors.
Carseats <- Carseats %>%
mutate (Sales_bi = as.factor (ifelse (Sales <= 8 , "No" , "Yes" ))) %>%
select (- Sales)
# Summary of the data
Carseats %>%
tbl_summary (
by = Sales_bi, # stratify by Sales_bi
statistic = list (all_continuous () ~ "{mean} ({sd})" ,
all_categorical () ~ "{n} ({p}%)" ),
missing = "ifany"
)
CompPrice
124 (15)
126 (16)
Income
65 (29)
74 (26)
Advertising
5.0 (5.8)
9.0 (7.1)
Population
260 (147)
273 (148)
Price
123 (22)
106 (23)
ShelveLoc
Bad
82 (35%)
14 (8.5%)
Good
19 (8.1%)
66 (40%)
Medium
135 (57%)
84 (51%)
Age
56 (16)
49 (15)
Education
10
23 (9.7%)
25 (15%)
11
31 (13%)
17 (10%)
12
30 (13%)
19 (12%)
13
26 (11%)
17 (10%)
14
26 (11%)
14 (8.5%)
15
18 (7.6%)
18 (11%)
16
29 (12%)
18 (11%)
17
28 (12%)
21 (13%)
18
25 (11%)
15 (9.1%)
Urban
172 (73%)
110 (67%)
US
135 (57%)
123 (75%)
# Pairwise plot of the continuous predictors only (categorical columns
# and the outcome are dropped first).
Carseats %>%
select (- c (ShelveLoc, Urban, US, Sales_bi)) %>%
ggpairs (lower= list (continuous= wrap ("points" , alpha= 0.5 , size= 0.5 )),
diag= list (continuous= 'barDiag' )) +
theme (axis.text.x = element_text (angle = 45 , vjust = 1 , hjust= 1 ))
Split the data into train and test sets
# 80/20 split, stratified on the binary outcome so both classes keep
# their proportions in train and test; seed fixed for reproducibility.
set.seed(123)
data_split <- initial_split(Carseats, prop = 0.8, strata = Sales_bi)
data_split
<Training/Testing/Total>
<319/81/400>
# Materialize the two partitions and check their dimensions.
Carseats_train <- training(data_split)
dim(Carseats_train)
Carseats_test <- testing(data_split)
dim(Carseats_test)
Build recipe
# Classification recipe: dummy-encode nominal predictors (ShelveLoc,
# Urban, US) and standardize numeric predictors.
class_rec <-
recipe (
Sales_bi ~ .,
data = Carseats_train
) %>%
step_dummy (all_nominal_predictors ()) %>%
step_normalize (all_numeric_predictors ())
Train classification tree
Classification tree model
# Classification tree: tune pruning penalty and depth; minimum node size
# fixed at 5.
classtree_mod <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = 5
) %>%
  set_mode("classification") %>%
  set_engine("rpart")
Set workflow
# Combine the classification recipe and tree spec into one workflow.
classtree_wf <- workflow(class_rec, classtree_mod)
Tuning grid
# 100 log-spaced cost-complexity values (1e-10 .. 10^-1.5) crossed with
# 5 tree depths (1 .. 10).
classtree_grid <- grid_regular (cost_complexity (range = c (- 10 , - 1.5 )),
tree_depth (range = c (1 L, 10 L)),
levels = c (100 ,5 ))
10-fold Cross-validation
# 10-fold CV on the Carseats training set; seed fixed for reproducibility.
set.seed (123 )
folds <- vfold_cv (Carseats_train, v = 10 )
folds
# 10-fold cross-validation
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [287/32]> Fold01
2 <split [287/32]> Fold02
3 <split [287/32]> Fold03
4 <split [287/32]> Fold04
5 <split [287/32]> Fold05
6 <split [287/32]> Fold06
7 <split [287/32]> Fold07
8 <split [287/32]> Fold08
9 <split [287/32]> Fold09
10 <split [288/31]> Fold10
Fit cross-validation
# Cross-validate every grid candidate, recording accuracy and ROC AUC.
classtree_fit <- classtree_wf %>%
tune_grid (
resamples = folds,
grid = classtree_grid,
metrics = metric_set (accuracy, roc_auc)
)
classtree_fit
# Tuning results
# 10-fold cross-validation
# A tibble: 10 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [287/32]> Fold01 <tibble [1,000 × 6]> <tibble [0 × 3]>
2 <split [287/32]> Fold02 <tibble [1,000 × 6]> <tibble [0 × 3]>
3 <split [287/32]> Fold03 <tibble [1,000 × 6]> <tibble [0 × 3]>
4 <split [287/32]> Fold04 <tibble [1,000 × 6]> <tibble [0 × 3]>
5 <split [287/32]> Fold05 <tibble [1,000 × 6]> <tibble [0 × 3]>
6 <split [287/32]> Fold06 <tibble [1,000 × 6]> <tibble [0 × 3]>
7 <split [287/32]> Fold07 <tibble [1,000 × 6]> <tibble [0 × 3]>
8 <split [287/32]> Fold08 <tibble [1,000 × 6]> <tibble [0 × 3]>
9 <split [287/32]> Fold09 <tibble [1,000 × 6]> <tibble [0 × 3]>
10 <split [288/31]> Fold10 <tibble [1,000 × 6]> <tibble [0 × 3]>
Visualize CV results
# CV ROC AUC vs cost complexity, one curve per tree depth.
classtree_fit %>%
  collect_metrics() %>%
  print(width = Inf) %>%
  filter(.metric == "roc_auc") %>%
  mutate(tree_depth = as.factor(tree_depth)) %>%
  ggplot(mapping = aes(x = cost_complexity, y = mean, color = tree_depth)) +
  geom_point() +
  geom_line() +
  # Grid is log-spaced over [1e-10, 10^-1.5]; without a log axis the
  # points pile up at the left edge.
  scale_x_log10() +
  labs(x = "Cost Complexity", y = "CV ROC AUC", color = "tree_depth")
# A tibble: 1,000 × 8
cost_complexity tree_depth .metric .estimator mean n std_err
<dbl> <int> <chr> <chr> <dbl> <int> <dbl>
1 1 e-10 1 accuracy binary 0.724 10 0.0237
2 1 e-10 1 roc_auc binary 0.681 10 0.0217
3 1.22e-10 1 accuracy binary 0.724 10 0.0237
4 1.22e-10 1 roc_auc binary 0.681 10 0.0217
5 1.48e-10 1 accuracy binary 0.724 10 0.0237
6 1.48e-10 1 roc_auc binary 0.681 10 0.0217
7 1.81e-10 1 accuracy binary 0.724 10 0.0237
8 1.81e-10 1 roc_auc binary 0.681 10 0.0217
9 2.21e-10 1 accuracy binary 0.724 10 0.0237
10 2.21e-10 1 roc_auc binary 0.681 10 0.0217
.config
<chr>
1 Preprocessor1_Model001
2 Preprocessor1_Model001
3 Preprocessor1_Model002
4 Preprocessor1_Model002
5 Preprocessor1_Model003
6 Preprocessor1_Model003
7 Preprocessor1_Model004
8 Preprocessor1_Model004
9 Preprocessor1_Model005
10 Preprocessor1_Model005
# ℹ 990 more rows
Finalize classification tree model
# Top 5 candidates by CV ROC AUC; metric passed by name (positional use
# is deprecated in recent {tune}).
classtree_fit %>%
  show_best(metric = "roc_auc")
# A tibble: 5 × 8
cost_complexity tree_depth .metric .estimator mean n std_err .config
<dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1.75e- 2 7 roc_auc binary 0.764 10 0.0248 Preprocesso…
2 1 e-10 10 roc_auc binary 0.761 10 0.0214 Preprocesso…
3 1.22e-10 10 roc_auc binary 0.761 10 0.0214 Preprocesso…
4 1.48e-10 10 roc_auc binary 0.761 10 0.0214 Preprocesso…
5 1.81e-10 10 roc_auc binary 0.761 10 0.0214 Preprocesso…
# Best hyperparameter pair by CV ROC AUC, metric named explicitly.
best_classtree <- classtree_fit %>%
  select_best(metric = "roc_auc")
best_classtree
# A tibble: 1 × 3
cost_complexity tree_depth .config
<dbl> <int> <chr>
1 0.0175 7 Preprocessor1_Model397
Final workflow
# Lock the selected hyperparameters into the workflow.
final_classtree_wf <- classtree_wf %>%
finalize_workflow (best_classtree)
Fit the whole training set, then predict the test cases
# Refit on the full training set and score the held-out test set.
final_classtree_fit <-
final_classtree_wf %>%
last_fit (data_split)
# Test metrics
final_classtree_fit %>%
collect_metrics ()
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.728 Preprocessor1_Model1
2 roc_auc binary 0.639 Preprocessor1_Model1
# Pull the fitted workflow out of the last_fit() result for plotting.
library (rpart.plot)
final_classtree <- extract_workflow (final_classtree_fit)
Visualize the final classification tree
# Draw the fitted tree from the underlying rpart object.
final_classtree %>%
extract_fit_engine () %>%
rpart.plot (roundint = FALSE )
Variables importance
# Variable-importance plot for the final classification tree.
library (vip)
final_classtree %>%
extract_fit_parsnip () %>%
vip ()
Summary: The final classification tree model has a depth of 7 and a cost complexity of 0.0175. The 3 most important variables are ShelveLoc_Good, Price and CompPrice, which means that whether the shelving location is good, the price of the car seat, and the competitor's price are the most important predictors of whether Sales > 8. The model assigns new Sales_bi categories by the majority class in the terminal nodes (leaves); its out-of-sample performance is moderate, with a test accuracy of 0.73 and ROC AUC of 0.64.
Train random forest
Random forest model
# Random forest for classification: tune mtry and the number of trees.
rf_mod <- rand_forest(
  mtry = tune(),
  trees = tune()
) %>%
  set_mode("classification") %>%
  set_engine("ranger")
rf_mod
Random Forest Model Specification (classification)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
Set workflow
# Classification recipe plus random-forest spec in one workflow.
rf_wf <- workflow(class_rec, rf_mod)
rf_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
Tuning grid
# 10 ensemble sizes (100-500 trees) crossed with mtry in 1..5.
rf_grid <- grid_regular (
trees (range = c (100 L, 500 L)),
mtry (range = c (1 L, 5 L)),
levels = c (10 , 5 )
)
10-fold Cross-validation
# Same seed, so all Carseats models are tuned on identical folds.
set.seed (123 )
folds <- vfold_cv (Carseats_train, v = 10 )
folds
# 10-fold cross-validation
# A tibble: 10 × 2
splits id
<list> <chr>
1 <split [287/32]> Fold01
2 <split [287/32]> Fold02
3 <split [287/32]> Fold03
4 <split [287/32]> Fold04
5 <split [287/32]> Fold05
6 <split [287/32]> Fold06
7 <split [287/32]> Fold07
8 <split [287/32]> Fold08
9 <split [287/32]> Fold09
10 <split [288/31]> Fold10
Fit cross-validation
# Cross-validate every (trees, mtry) pair, recording accuracy and ROC AUC.
rf_fit <- rf_wf %>%
tune_grid (
resamples = folds,
grid = rf_grid,
metrics = metric_set (accuracy, roc_auc)
)
Visualize CV results
# CV ROC AUC vs number of trees, one curve per mtry value.
rf_fit %>%
collect_metrics () %>%
print (width = Inf ) %>%
filter (.metric == "roc_auc" ) %>%
mutate (mtry = as.factor (mtry)) %>%
ggplot (mapping = aes (x = trees, y = mean, color = mtry)) +
# geom_point() +
geom_line () +
labs (x = "Num. of Trees" , y = "CV ROC AUC" )
# A tibble: 100 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 100 accuracy binary 0.787 10 0.0213 Preprocessor1_Model01
2 1 100 roc_auc binary 0.849 10 0.0242 Preprocessor1_Model01
3 1 144 accuracy binary 0.777 10 0.0216 Preprocessor1_Model02
4 1 144 roc_auc binary 0.848 10 0.0216 Preprocessor1_Model02
5 1 188 accuracy binary 0.812 10 0.0244 Preprocessor1_Model03
6 1 188 roc_auc binary 0.857 10 0.0209 Preprocessor1_Model03
7 1 233 accuracy binary 0.790 10 0.0235 Preprocessor1_Model04
8 1 233 roc_auc binary 0.859 10 0.0241 Preprocessor1_Model04
9 1 277 accuracy binary 0.793 10 0.0259 Preprocessor1_Model05
10 1 277 roc_auc binary 0.861 10 0.0203 Preprocessor1_Model05
# ℹ 90 more rows
Finalize random forest model
# Top 5 candidates by CV ROC AUC; metric passed by name (positional use
# is deprecated in recent {tune}).
rf_fit %>%
  show_best(metric = "roc_auc")
# A tibble: 5 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 4 233 roc_auc binary 0.892 10 0.0180 Preprocessor1_Model34
2 5 188 roc_auc binary 0.891 10 0.0180 Preprocessor1_Model43
3 4 188 roc_auc binary 0.889 10 0.0175 Preprocessor1_Model33
4 3 144 roc_auc binary 0.889 10 0.0190 Preprocessor1_Model22
5 3 500 roc_auc binary 0.888 10 0.0197 Preprocessor1_Model30
# Best (mtry, trees) pair by CV ROC AUC, metric named explicitly.
best_rf <- rf_fit %>%
  select_best(metric = "roc_auc")
best_rf
# A tibble: 1 × 3
mtry trees .config
<int> <int> <chr>
1 4 233 Preprocessor1_Model34
Final workflow
# Lock the selected hyperparameters into the workflow.
final_rf_wf <- rf_wf %>%
finalize_workflow (best_rf)
final_rf_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)
Main Arguments:
mtry = 4
trees = 233
Computational engine: ranger
Fit the whole training set, then predict the test cases
# Refit on the full training set and score the held-out test set.
final_rf_fit <-
final_rf_wf %>%
last_fit (data_split)
# Test metrics
final_rf_fit %>%
collect_metrics ()
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.802 Preprocessor1_Model1
2 roc_auc binary 0.877 Preprocessor1_Model1
Summary: The random forest algorithm grows multiple decision trees; the final random forest model has 233 trees and 4 predictors randomly sampled at each split. For classification, the random forest predicts new Sales_bi values by majority vote across all trees, and it has a good performance, with a test accuracy of 0.80 and ROC AUC of 0.88.
Train gradient boosting
Gradient boosting model
# Gradient boosting for classification: 1000 trees, with depth and
# learning rate tuned by CV.
gb_mod <- boost_tree(
  trees = 1000,
  tree_depth = tune(),
  learn_rate = tune()
) %>%
  set_mode("classification") %>%
  set_engine("xgboost")
gb_mod
Boosted Tree Model Specification (classification)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
Set workflow
# Classification recipe plus boosted-tree spec.
gb_wf <- workflow () %>%
add_recipe (class_rec) %>%
add_model (gb_mod)
gb_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (classification)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
Tuning grid
# Regular 5 x 20 tuning grid (100 candidates): tree depths 1-5
# crossed with 20 learning rates spaced evenly on the log10 scale
# between 1e-5 and 1 (the range is given on the transformed scale).
gb_grid <- grid_regular(
  tree_depth(range = c(1L, 5L)),
  learn_rate(range = c(-5, 0), trans = log10_trans()),
  levels = c(5, 20)
)
gb_grid
# A tibble: 100 × 2
tree_depth learn_rate
<int> <dbl>
1 1 0.00001
2 2 0.00001
3 3 0.00001
4 4 0.00001
5 5 0.00001
6 1 0.0000183
7 2 0.0000183
8 3 0.0000183
9 4 0.0000183
10 5 0.0000183
# ℹ 90 more rows
10-fold Cross-validation
# Reproducible 10-fold cross-validation resamples of the training set
set.seed(123)
folds <- vfold_cv(Carseats_train, v = 10)
Fit cross-validation
# Evaluate every depth x learning-rate candidate over the 10 CV
# folds, tracking accuracy and ROC AUC for each.
gb_fit <- tune_grid(
  gb_wf,
  resamples = folds,
  grid = gb_grid,
  metrics = metric_set(accuracy, roc_auc)
)
Visualize CV results
# Collect the CV results once, print the full-width table, then plot
# ROC AUC against learning rate. Printing is done as its own statement
# rather than relying on print()'s invisible return value inside the
# pipeline, which buried a side effect mid-chain.
gb_metrics <- gb_fit %>%
  collect_metrics()
print(gb_metrics, width = Inf)

gb_metrics %>%
  filter(.metric == "roc_auc") %>%
  mutate(tree_depth = as.factor(tree_depth)) %>%
  ggplot(aes(x = learn_rate, y = mean, color = tree_depth)) +
  geom_point() +
  geom_line() +
  labs(x = "Learning Rate", y = "CV ROC AUC", color = "tree_depth")
# A tibble: 200 × 8
tree_depth learn_rate .metric .estimator mean n std_err
<int> <dbl> <chr> <chr> <dbl> <int> <dbl>
1 1 0.00001 accuracy binary 0.724 10 0.0237
2 1 0.00001 roc_auc binary 0.681 10 0.0217
3 2 0.00001 accuracy binary 0.721 10 0.0248
4 2 0.00001 roc_auc binary 0.740 10 0.0227
5 3 0.00001 accuracy binary 0.702 10 0.0222
6 3 0.00001 roc_auc binary 0.722 10 0.0249
7 4 0.00001 accuracy binary 0.736 10 0.0234
8 4 0.00001 roc_auc binary 0.767 10 0.0249
9 5 0.00001 accuracy binary 0.746 10 0.0272
10 5 0.00001 roc_auc binary 0.780 10 0.0200
.config
<chr>
1 Preprocessor1_Model001
2 Preprocessor1_Model001
3 Preprocessor1_Model002
4 Preprocessor1_Model002
5 Preprocessor1_Model003
6 Preprocessor1_Model003
7 Preprocessor1_Model004
8 Preprocessor1_Model004
9 Preprocessor1_Model005
10 Preprocessor1_Model005
# ℹ 190 more rows
Finalize gradient boosting model
# Top 5 candidates ranked by CV ROC AUC. `metric` is passed by name:
# positional use is deprecated in recent versions of tune.
gb_fit %>%
  show_best(metric = "roc_auc")
# A tibble: 5 × 8
tree_depth learn_rate .metric .estimator mean n std_err .config
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 0.0886 roc_auc binary 0.932 10 0.0163 Preprocessor1_Mo…
2 1 0.0483 roc_auc binary 0.929 10 0.0176 Preprocessor1_Mo…
3 1 0.162 roc_auc binary 0.928 10 0.0168 Preprocessor1_Mo…
4 2 0.0264 roc_auc binary 0.925 10 0.0150 Preprocessor1_Mo…
5 2 0.0483 roc_auc binary 0.925 10 0.0153 Preprocessor1_Mo…
# Single best candidate by CV ROC AUC (named `metric` argument;
# positional use is deprecated in recent tune releases).
best_gb <- gb_fit %>%
  select_best(metric = "roc_auc")
best_gb
# A tibble: 1 × 3
tree_depth learn_rate .config
<int> <dbl> <chr>
1 1 0.0886 Preprocessor1_Model076
Final workflow
# Lock the tuned tree depth and learning rate into the boosting
# workflow, replacing the tune() placeholders.
final_gb_wf <- finalize_workflow(gb_wf, best_gb)
final_gb_wf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
2 Recipe Steps
• step_dummy()
• step_normalize()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (classification)
Main Arguments:
trees = 1000
tree_depth = 1
learn_rate = 0.0885866790410082
Computational engine: xgboost
Fit the whole training set, then predict the test cases
# Fit the finalized boosting workflow on the full training split and
# evaluate it once on the held-out test split.
final_gb_fit <- last_fit(final_gb_wf, data_split)

# Test-set performance metrics
collect_metrics(final_gb_fit)
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.877 Preprocessor1_Model1
2 roc_auc binary 0.946 Preprocessor1_Model1
Summary: Gradient boosting builds trees sequentially, where each new tree aims to correct the errors made by the previous ones. The final gradient boosting model has 1000 trees, a tree depth of 1, and a learning rate of 0.0886. The model estimates the new Sales_bi category by adding trees that predict the residuals of prior trees, and it achieves the best performance, with an accuracy of 0.88 and ROC AUC of 0.95 on the test set.
Compare Performance
# Side-by-side test-set comparison of the three models.
# pivot_wider() replaces the superseded spread(); arrange(model)
# keeps the same alphabetical row order spread() produced.
bind_rows(
  final_classtree_fit %>%
    collect_metrics() %>%
    mutate(model = "Classification Tree"),
  final_rf_fit %>%
    collect_metrics() %>%
    mutate(model = "Random Forest"),
  final_gb_fit %>%
    collect_metrics() %>%
    mutate(model = "Gradient Boosting")
) %>%
  select(model, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  arrange(model)
# A tibble: 3 × 3
model accuracy roc_auc
<chr> <dbl> <dbl>
1 Classification Tree 0.728 0.639
2 Gradient Boosting 0.877 0.946
3 Random Forest 0.802 0.877